Reproducible workflow to assess differentially abundant ASVs and taxa between temperature treatments using Indicator Species Analysis and linear discriminant analysis (LDA) effect size (LEfSe).
knitr::opts_chunk$set(echo = TRUE, eval = FALSE)
set.seed(119)
#library(conflicted)
#pacman::p_depends(microbiomeMarker, local = TRUE)
#pacman::p_depends_reverse(microbiomeMarker, local = TRUE)
library(phyloseq); packageVersion("phyloseq")
library(Biostrings); packageVersion("Biostrings")
pacman::p_load(tidyverse, metacoder, hilldiv, patchwork, ampvis2,
agricolae, labdsv, naniar, codefolder, ape,
pairwiseAdonis, microbiome, seqRFLP, DT, microbiomeMarker,
microbiomeMarker, reactable, downloadthis, captioner,
install = FALSE, update = FALSE)
options(scipen=999)
knitr::opts_current$get(c(
"cache",
"cache.path",
"cache.rebuild",
"dependson",
"autodep"
))This workflow contains ASV differential abundance analysis for the high temperature data sets. In order to run the workflow, you …
In this workflow, we use the labdsv package (Roberts and Roberts 2016) to run Dufrene-Legendre Indicator Species Analysis and the microbiomeMarker package (Cao 2020) to run linear discriminant analysis (LDA) effect size (LEfSe) (Segata et al. 2011).
Indicator Analysis calculates the indicator value (fidelity and relative abundance) of species in clusters or types.
samp_ps <- c("ssu18_ps_work", "ssu18_ps_filt",
"ssu18_ps_perfect", "ssu18_ps_pime")
indval_pval <- 0.05for (i in samp_ps) {
tmp_get <- get(i)
tmp_df <- data.frame(otu_table(tmp_get))
tmp_df <- tmp_df[, which(colSums(tmp_df) != 0)]
tmp_row_names <- row.names(tmp_df)
tmp_row_names <- tmp_row_names %>%
stringr::str_replace("P[0-9]{2}_D[0-9]{2}_[0-9]{3}_", "") %>%
stringr::str_replace("[A-Z]$", "")
tmp_df <- tmp_df %>% tibble::add_column(tmp_row_names, .before = 1)
tmp_name <- purrr::map_chr(i, ~ paste0(., "_seq_tab"))
assign(tmp_name, tmp_df)
rm(list = ls(pattern = "tmp_"))
}
objects()set.seed(1191)
for (i in samp_ps) {
tmp_get <- get(purrr::map_chr(i, ~ paste0(., "_seq_tab")))
tmp_iva <- indval(tmp_get[,-1], tmp_get[,1])
tmp_gr <- tmp_iva$maxcls[tmp_iva$pval <= indval_pval]
tmp_iv <- tmp_iva$indcls[tmp_iva$pval <= indval_pval]
tmp_pv <- tmp_iva$pval[tmp_iva$pval <= indval_pval]
tmp_fr <- apply(tmp_get[,-1] > 0, 2, sum)[tmp_iva$pval <= indval_pval]
tmp_sum <- data.frame(group = tmp_gr, indval = tmp_iv,
pval = tmp_pv, freq = tmp_fr)
tmp_sum <- tmp_sum[order(tmp_sum$group, -tmp_sum$indval),]
tmp_tax_df <- data.frame(tax_table(get(i)))
tmp_tax_df$ASV_ID <- NULL
tmp_sum_tax <- merge(tmp_sum, tmp_tax_df, by = "row.names", all = TRUE)
tmp_sum_tax <- tmp_sum_tax[!(is.na(tmp_sum_tax$group)),]
class(tmp_sum_tax$group) <- "character"
tmp_sum_tax$group <- stringr::str_replace(tmp_sum_tax$group, "^1$", "C0")
tmp_sum_tax$group <- stringr::str_replace(tmp_sum_tax$group, "^2$", "W3")
tmp_sum_tax$group <- stringr::str_replace(tmp_sum_tax$group, "^3$", "W8")
tmp_sum_tax <- tmp_sum_tax %>% dplyr::rename("ASV_ID" = "Row.names")
tmp_sum_tax <- tmp_sum_tax[order(as.numeric(gsub("[A-Z]{3}", "", tmp_sum_tax$ASV_ID))),]
tmp_sum_tax$ASV_ID <- as.character(tmp_sum_tax$ASV_ID)
tmp_sum.prob.corrected <- p.adjust(tmp_sum$pval, "bonferroni")
tmp_res_name <- purrr::map_chr(i, ~ paste0(., "_indval_summary"))
assign(tmp_res_name, tmp_sum_tax)
rm(list = ls(pattern = "tmp_"))
}
objects()Now we can save a few files and display the data.
for (i in samp_ps) {
tmp_get <- get(purrr::map_chr(i, ~ paste0(., "_seq_tab")))
tmp_get[,1] <- NULL
tmp_df <- as.data.frame(t(tmp_get))
tmp_col_names <- colnames(tmp_df)
tmp_col_names <- tmp_col_names %>%
stringr::str_replace("P[0-9]{2}_D[0-9]{2}_[0-9]{3}_", "") %>%
stringr::str_replace("[A-Z]$", "")
colnames(tmp_df) <- tmp_col_names
#tmp_df$freq_all <- apply(tmp_df > 0, 1, sum)
tmp_df$freq_C0 <- apply(tmp_df[ , (names(tmp_df) %in% "C0")] > 0, 1, sum)
tmp_df$freq_W3 <- apply(tmp_df[ , (names(tmp_df) %in% "W3")] > 0, 1, sum)
tmp_df$freq_W8 <- apply(tmp_df[ , (names(tmp_df) %in% "W8")] > 0, 1, sum)
tmp_df$reads_total <- base::rowSums(tmp_df[ , (names(tmp_df) %in% c("C0", "W3", "W8"))])
tmp_df$reads_C0 <- base::rowSums(tmp_df[ , (names(tmp_df) %in% "C0")])
tmp_df$reads_W3 <- base::rowSums(tmp_df[ , (names(tmp_df) %in% "W3")])
tmp_df$reads_W8 <- base::rowSums(tmp_df[ , (names(tmp_df) %in% "W8")])
tmp_df <- tmp_df[,!grepl("^[C | W]", names(tmp_df))]
tmp_df <- tmp_df %>% tibble::rownames_to_column("ASV_ID")
tmp_get_indval <- get(purrr::map_chr(i, ~ paste0(., "_indval_summary")))
tmp_merge_df <- merge(tmp_df, tmp_get_indval, by = "ASV_ID", all = FALSE)
tmp_merge_df <- tmp_merge_df[,c(1,9:12,2:8,13:19)]
tmp_merge_df <- tmp_merge_df[order(tmp_merge_df$reads_total, decreasing = TRUE), ]
tmp_merge_name <- purrr::map_chr(i, ~ paste0(., "_indval_final"))
assign(tmp_merge_name, tmp_merge_df)
rm(list = ls(pattern = "tmp_"))
}
objects(pattern = "_indval_summary")for (i in samp_ps) {
tmp_get <- get(i)
tmp_tab <- get(purrr::map_chr(i, ~ paste0(., "_indval_summary")))
tmp_list <- tmp_tab[,1]
tmp_ps <- prune_taxa(tmp_list, tmp_get)
tmp_name <- purrr::map_chr(i, ~ paste0(., "_ind"))
assign(tmp_name, tmp_ps)
tmp_ps@phy_tree <- NULL
tmp_ps <- prune_samples(sample_sums(tmp_ps) > 0, tmp_ps)
tmp_ps_tree <- rtree(ntaxa(tmp_ps), rooted = TRUE,
tip.label = taxa_names(tmp_ps))
tmp_ps <- merge_phyloseq(tmp_ps, sample_data, tmp_ps_tree)
assign(tmp_name, tmp_ps)
rm(list = ls(pattern = "tmp_"))
}
objects()(16S rRNA) Table 1 | Significant results of Indicator Analysis for FULL data set.
(16S rRNA) Table 2 | Significant results of Indicator Analysis for Arbitrary filtered ASV data set.
(16S rRNA) Table 3 | Significant results of Indicator Analysis for PERfect filtered data set.
(16S rRNA) Table 4 | Significant results of Indicator Analysis for PIME data set.
Linear discriminant analysis (LDA) effect size (LEfSe) is a method to support high-dimensional class comparisons with a particular focus on microbial community analyses. LEfSe determines the features (organisms, clades, operational taxonomic units, genes, or functions) most likely to explain differences between classes by coupling standard tests for statistical significance with additional tests encoding biological consistency and effect relevance. Class comparison methods typically predict biomarkers consisting of features that violate a null hypothesis of no difference between classes. The effect size provides an estimation of the magnitude of the observed phenomenon due to each characterizing feature and it is thus a valuable tool for ranking the relevance of different biological aspects and for addressing further investigations and analyses.
First, a little data tidying and subsetting only ASV data from the taxonomy table.
for (i in samp_ps) {
tmp_get <- get(i)
tax_table(tmp_get) <- tax_table(tmp_get)[,c(1:6)]
tax_table(tmp_get) <- cbind(tax_table(tmp_get),
rownames(tax_table(tmp_get)))
colnames(tax_table(tmp_get)) <- c("Kingdom", "Phylum", "Class", "Order",
"Family", "Genus", "Species")
tmp_name <- purrr::map_chr(i, ~ paste0(., "_rf_all"))
assign(tmp_name, tmp_get)
tax_table(tmp_get) <- tax_table(tmp_get)[,c(7)]
tmp_name_asv <- purrr::map_chr(i, ~ paste0(., "_rf_asv"))
assign(tmp_name_asv, tmp_get)
rm(list = ls(pattern = "tmp_"))
}
objects(pattern = "_rf")samp_ps <- c("ssu18_ps_work", "ssu18_ps_filt", "ssu18_ps_perfect", "ssu18_ps_pime")
lefse_lda <- 2
lefse_kw <- 0.05
lefse_wc <- 0.05for (i in samp_ps) {
tmp_get <- get(purrr::map_chr(i, ~ paste0(., "_rf_asv")))
tmp_lefse <- run_lefse(tmp_get, norm = "CPM", class = "TEMP", correct = "2",
lda_cutoff = lefse_lda,
kw_cutoff = lefse_kw,
wilcoxon_cutoff = lefse_wc,
bootstrap_n = 30, bootstrap_fraction = 2/3,
sample_min = 10, multicls_strat = FALSE, curv = FALSE)
tmp_name <- purrr::map_chr(i, ~ paste0(., "_lefse_asv"))
assign(tmp_name, tmp_lefse)
rm(list = ls(pattern = "tmp_"))
}
objects(pattern = "_lefse_asv")
for (i in samp_ps) {
tmp_get <- get(purrr::map_chr(i, ~ paste0(., "_rf_all")))
tmp_lefse <- run_lefse(tmp_get, norm = "CPM", class = "TEMP", correct = "2",
lda_cutoff = lefse_lda,
kw_cutoff = lefse_kw,
wilcoxon_cutoff = lefse_wc,
bootstrap_n = 30, bootstrap_fraction = 2/3,
sample_min = 10, multicls_strat = FALSE, curv = FALSE)
tmp_name <- purrr::map_chr(i, ~ paste0(., "_lefse_all"))
assign(tmp_name, tmp_lefse)
rm(list = ls(pattern = "tmp_"))
}
objects(pattern = "_lefse_asv")
objects(pattern = "_lefse_all")for (i in samp_ps) {
tmp_o_tax <- data.frame(tax_table(get(i)))
tmp_o_tax$ASV_ID <- NULL
tmp_o_tax <- tmp_o_tax %>% tibble::rownames_to_column("ASV_ID")
tmp_get <- get(purrr::map_chr(i, ~ paste0(., "_lefse_asv")))
tmp_mt <- data.frame(marker_table(tmp_get))
tmp_mt$feature <- gsub('s__', '', tmp_mt$feature)
tmp_mt <- tmp_mt %>% dplyr::rename(c("ASV_ID" = 1, "group" = 2, "pval" = 4)) %>%
tibble::remove_rownames()
tmp_sum <- dplyr::left_join(tmp_mt, tmp_o_tax, by = "ASV_ID")
tmp_sum$group <- stringr::str_replace(tmp_sum$group, "^0$", "C0")
tmp_sum$group <- stringr::str_replace(tmp_sum$group, "^3$", "W3")
tmp_sum$group <- stringr::str_replace(tmp_sum$group, "^8$", "W8")
tmp_sum <- tmp_sum[order(as.numeric(gsub("[A-Z]{3}", "", tmp_sum$ASV_ID))),]
tmp_res_name <- purrr::map_chr(i, ~ paste0(., "_lefse_summary"))
assign(tmp_res_name, tmp_sum)
rm(list = ls(pattern = "tmp_"))
}Now we can save a few files and display the data.
for (i in samp_ps) {
tmp_get <- get(i)
tmp_df <- data.frame(t(otu_table(tmp_get)))
#tmp_get[,1] <- NULL
#tmp_df <- as.data.frame(t(tmp_otu))
tmp_col_names <- colnames(tmp_df)
tmp_col_names <- tmp_col_names %>%
stringr::str_replace("P[0-9]{2}_D[0-9]{2}_[0-9]{3}_", "") %>%
stringr::str_replace("[A-Z]$", "")
colnames(tmp_df) <- tmp_col_names
tmp_df$freq <- apply(tmp_df > 0, 1, sum)
tmp_df$freq_C0 <- apply(tmp_df[ , (names(tmp_df) %in% "C0")] > 0, 1, sum)
tmp_df$freq_W3 <- apply(tmp_df[ , (names(tmp_df) %in% "W3")] > 0, 1, sum)
tmp_df$freq_W8 <- apply(tmp_df[ , (names(tmp_df) %in% "W8")] > 0, 1, sum)
tmp_df$reads_total <- base::rowSums(tmp_df[ , (names(tmp_df) %in% c("C0", "W3", "W8"))])
tmp_df$reads_C0 <- base::rowSums(tmp_df[ , (names(tmp_df) %in% "C0")])
tmp_df$reads_W3 <- base::rowSums(tmp_df[ , (names(tmp_df) %in% "W3")])
tmp_df$reads_W8 <- base::rowSums(tmp_df[ , (names(tmp_df) %in% "W8")])
tmp_df <- tmp_df[,!grepl("^[C | W]", names(tmp_df))]
tmp_df <- tmp_df %>% tibble::rownames_to_column("ASV_ID")
tmp_get_lefse <- get(purrr::map_chr(i, ~ paste0(., "_lefse_summary")))
tmp_merge_df <- merge(tmp_df, tmp_get_lefse, by = "ASV_ID", all = FALSE)
tmp_merge_df <- tmp_merge_df[,c(1,10:12,2:9,14:20)]
tmp_merge_df <- tmp_merge_df[order(tmp_merge_df$reads_total, decreasing = TRUE), ]
tmp_merge_name <- purrr::map_chr(i, ~ paste0(., "_lefse_final"))
assign(tmp_merge_name, tmp_merge_df)
rm(list = ls(pattern = "tmp_"))
}Save a new phyloseq object
for (i in samp_ps) {
tmp_get <- get(i)
tmp_tab <- get(purrr::map_chr(i, ~ paste0(., "_lefse_summary")))
tmp_list <- tmp_tab[,1]
tmp_ps <- prune_taxa(tmp_list, tmp_get)
tmp_name <- purrr::map_chr(i, ~ paste0(., "_lefse"))
assign(tmp_name, tmp_ps)
tmp_ps@phy_tree <- NULL
tmp_ps <- prune_samples(sample_sums(tmp_ps) > 0, tmp_ps)
tmp_ps_tree <- rtree(ntaxa(tmp_ps), rooted = TRUE,
tip.label = taxa_names(tmp_ps))
tmp_ps <- merge_phyloseq(tmp_ps, sample_data, tmp_ps_tree)
assign(tmp_name, tmp_ps)
rm(list = ls(pattern = "tmp_"))
}
objects()(16S rRNA) Table 5 | Significant results of LEfSe Analysis for FULL data set.
(16S rRNA) Table 6 | Significant results of LEfSe Analysis for Arbitrary filtered ASV data set.
(16S rRNA) Table 7 | Significant results of LEfSe Analysis for PERfect filtered data set.
(16S rRNA) Table 8 | Significant results of LEfSe Analysis for PIME data set.
for (i in samp_ps) {
tmp_get <- get(purrr::map_chr(i, ~ paste0(., "_lefse_asv")))
tmp_mt <- data.frame(marker_table(tmp_get))
tmp_mt <- tmp_mt %>% filter(ef_lda >= 3)
tmp_mt <- marker_table(tmp_mt)
tmp_res_name <- purrr::map_chr(i, ~ paste0(., "_lefse_asv_viz_trim"))
assign(tmp_res_name, tmp_mt)
rm(list = ls(pattern = "tmp_"))
}
objects(pattern = "_lefse_asv_viz_trim")
(16S rRNA) Figure 1 | Differentially abundant ASVs from LEfSe analysis for the FULL data set, trimmed to show those with LDA score ≥ 3.0.

(16S rRNA) Figure 2 | Differentially abundant ASVs from LEfSe analysis for the Arbitrary filtered data set, trimmed to show those with LDA score ≥ 3.0.

(16S rRNA) Figure 3 | Differentially abundant ASVs from LEfSe analysis for the PERfect filtered data set, trimmed to show those with LDA score ≥ 3.0.

(16S rRNA) Figure 4 | Differentially abundant ASVs from LEfSe analysis for the PIME filtered data set, trimmed to show those with LDA score ≥ 3.0.
for (i in samp_ps) {
tmp_get <- get(purrr::map_chr(i, ~ paste0(., "_lefse_all")))
tmp_mt <- data.frame(marker_table(tmp_get))
tmp_mt <- tmp_mt[, c(2:4,1)]
tmp_mt <- tmp_mt %>% dplyr::rename(c("group" = 1, "pval" = 3))
tmp_mt$group <- stringr::str_replace(tmp_mt$group, "^0$", "C0")
tmp_mt$group <- stringr::str_replace(tmp_mt$group, "^3$", "W3")
tmp_mt$group <- stringr::str_replace(tmp_mt$group, "^8$", "W8")
tmp_mt <- tmp_mt %>% tibble::rownames_to_column("marker")
tmp_mt <- tmp_mt %>% tidyr::separate(col = "feature",
into = c("Kingdom", "Phylum", "Class",
"Order", "Family", "Genus", "ASV_ID"),
sep = "\\|\\w__")
tmp_mt$Kingdom <- gsub('k__', '', tmp_mt$Kingdom)
tmp_res_name <- purrr::map_chr(i, ~ paste0(., "_lefse_marker_final"))
assign(tmp_res_name, tmp_mt)
rm(list = ls(pattern = "tmp_"))
}
objects(pattern = "_lefse_marker_final")(16S rRNA) Table 9 | Significant results of LEfSe MARKER Analysis for FULL data set.
(16S rRNA) Table 10 | Significant results of LEfSe MARKER Analysis for Arbitrary filtered ASV data set.
(16S rRNA) Table 11 | Significant results of LEfSe MARKER Analysis for PERfect filtered data set.
(16S rRNA) Table 12 | Significant results of LEfSe MARKER Analysis for PIME data set.
for (i in samp_ps) {
tmp_get <- get(purrr::map_chr(i, ~ paste0(., "_lefse_all")))
tmp_mt <- data.frame(marker_table(tmp_get))
tmp_mt <- tmp_mt %>% filter(ef_lda >= 4)
tmp_mt <- marker_table(tmp_mt)
tmp_res_name <- purrr::map_chr(i, ~ paste0(., "_lefse_all_viz_trim"))
assign(tmp_res_name, tmp_mt)
rm(list = ls(pattern = "tmp_"))
}
objects(pattern = "_lefse_all_viz_trim")
(16S rRNA) Figure 5 | Differentially abundant MARKER from LEfSe analysis for the FULL data set, trimmed to show those with LDA score ≥ 4.0.

(16S rRNA) Figure 6 | Differentially abundant MARKER from LEfSe analysis for the Arbitrary filtered data set, trimmed to show those with LDA score ≥ 4.0.

(16S rRNA) Figure 7 | Differentially abundant MARKER from LEfSe analysis for the PERfect filtered data set, trimmed to show those with LDA score ≥ 4.0.

(16S rRNA) Figure 8 | Differentially abundant MARKER from LEfSe analysis for the PIME filtered data set, trimmed to show those with LDA score ≥ 4.0.
In this section of the workflow we use the microbiomeMarker package to assess the response of taxonomic lineages to soil warming. In the first step we need to fix the selected data set to make it compatible with the various functions. For this analysis we use the PERfect filtered data set.
## FIX ps object
ssu_ps <- ssu18_ps_perfect_rf_all
tmp_tax1 <- data.frame(tax_table(ssu_ps))
#tmp_tax1$ASV_SEQ <- NULL
tmp_rn <- row.names(tmp_tax1)
tmp_tax <- data.frame(lapply(tmp_tax1, function(x) {gsub("\\(|)", "", x)}))
row.names(tmp_tax) <- tmp_rn
identical(row.names(tmp_tax), row.names(tmp_tax1))
#dplyr::filter(tmp_tax, Phylum == "Acidobacteriota")
ps_tax_new <- as.matrix(tmp_tax)
tmp_ps <- phyloseq(otu_table(ssu_ps),
phy_tree(ssu_ps),
tax_table(ps_tax_new),
sample_data(ssu_ps))
ssu_ps <- tmp_ps
phyloseq::rank_names(ssu_ps)
data.frame(tax_table(ssu_ps))Next we run a statistical test for multiple groups using the run_test_multiple_groups function.
ssu_group_anova <- run_test_multiple_groups(ssu_ps, group = "TEMP",
taxa_rank = "all",
method = "anova")
ssu_group_anova@marker_table
marker_table(ssu_group_anova)And then conduct ost hoc pairwise comparisons for multiple groups test using the run_posthoc_test function.
ssu_default_pht <- run_posthoc_test(ssu_ps, group = "TEMP",
method = "tukey", transform = "log10")We can filter out a select taxa and plot the results.
filter(data.frame(ssu_default_pht@result), group_name == "k__Bacteria|p__Actinobacteriota|c__Thermoleophilia|o__Gaiellales") group
3-0.140 141
8-0.140 141
8-3.140 141
group_name
3-0.140 k__Bacteria|p__Actinobacteriota|c__Thermoleophilia|o__Gaiellales
8-0.140 k__Bacteria|p__Actinobacteriota|c__Thermoleophilia|o__Gaiellales
8-3.140 k__Bacteria|p__Actinobacteriota|c__Thermoleophilia|o__Gaiellales
comparions diff_mean pvalue ci_lower ci_upper
3-0.140 3-0 0.007492377 0.645933374 -0.0145296109 0.02951437
8-0.140 8-0 0.030358682 0.008215796 0.0083366941 0.05238067
8-3.140 8-3 0.022866305 0.041743700 0.0008443168 0.04488829
plot_postHocTest(ssu_default_pht, feature = "k__Bacteria|p__Actinobacteriota|c__Thermoleophilia|o__Gaiellales") & theme_bw()
But what we really want to do is get all of the markers that are significant from the analysis, excluding any significant ASVs so we can look at high taxa ranks.
ssu_pht <- ssu_default_pht
ssu18_pht_filt <- filter(data.frame(ssu_pht@result), pvalue <= "0.05")[!grepl("ASV", filter(data.frame(ssu_pht@result), pvalue <= "0.05")$group_name),]
ssu18_pht_filt <- ssu18_pht_filt[!grepl("[a-z]__$", ssu18_pht_filt$group_name),]
ssu18_pht_filt <- unique(ssu18_pht_filt$group_name)
length(ssu18_pht_filt)
save.image("page_build/mmarker_wf.rdata")There are 47 significantly different lineage markers.
[1] "k__Archaea|p__Thermoplasmatota"
[2] "k__Bacteria|p__Actinobacteriota"
[3] "k__Bacteria|p__Bacteroidota"
[4] "k__Bacteria|p__Firmicutes"
[5] "k__Bacteria|p__MBNT15"
[6] "k__Bacteria|p__Myxococcota"
[7] "k__Archaea|p__Thermoplasmatota|c__Thermoplasmata"
[8] "k__Bacteria|p__Acidobacteriota|c__Subgroup_11"
[9] "k__Bacteria|p__Acidobacteriota|c__Subgroup_22"
[10] "k__Bacteria|p__Actinobacteriota|c__MB-A2-108"
[11] "k__Bacteria|p__Actinobacteriota|c__Thermoleophilia"
[12] "k__Bacteria|p__Bacteroidota|c__Bacteroidia"
[13] "k__Bacteria|p__Firmicutes|c__Bacilli"
[14] "k__Bacteria|p__Myxococcota|c__bacteriap25"
[15] "k__Bacteria|p__Planctomycetota|c__Pla4_lineage"
[16] "k__Bacteria|p__Acidobacteriota|c__Acidobacteriae|o__Subgroup_2"
[17] "k__Bacteria|p__Actinobacteriota|c__Acidimicrobiia|o__Microtrichales"
[18] "k__Bacteria|p__Actinobacteriota|c__Thermoleophilia|o__Gaiellales"
[19] "k__Bacteria|p__Bacteroidota|c__Bacteroidia|o__Cytophagales"
[20] "k__Bacteria|p__Bacteroidota|c__Bacteroidia|o__Flavobacteriales"
[21] "k__Bacteria|p__Bacteroidota|c__Bacteroidia|o__Sphingobacteriales"
[22] "k__Bacteria|p__Firmicutes|c__Bacilli|o__Bacillales"
[23] "k__Bacteria|p__Myxococcota|c__Polyangia|o__mle1-27"
[24] "k__Bacteria|p__Myxococcota|c__Polyangia|o__Polyangiales"
[25] "k__Bacteria|p__Actinobacteriota|c__Thermoleophilia|o__Gaiellales|f__Gaiellaceae"
[26] "k__Bacteria|p__Bacteroidota|c__Bacteroidia|o__Chitinophagales|f__Saprospiraceae"
[27] "k__Bacteria|p__Bacteroidota|c__Bacteroidia|o__Cytophagales|f__Hymenobacteraceae"
[28] "k__Bacteria|p__Bacteroidota|c__Bacteroidia|o__Flavobacteriales|f__Flavobacteriaceae"
[29] "k__Bacteria|p__Bacteroidota|c__Bacteroidia|o__Sphingobacteriales|f__AKYH767"
[30] "k__Bacteria|p__Firmicutes|c__Bacilli|o__Bacillales|f__Planococcaceae"
[31] "k__Bacteria|p__Myxococcota|c__Polyangia|o__Polyangiales|f__BIrii41"
[32] "k__Bacteria|p__Myxococcota|c__Polyangia|o__Polyangiales|f__Polyangiaceae"
[33] "k__Bacteria|p__Proteobacteria|c__Alphaproteobacteria|o__Caulobacterales|f__Hyphomonadaceae"
[34] "k__Bacteria|p__Actinobacteriota|c__Thermoleophilia|o__Gaiellales|f__Gaiellaceae|g__Gaiella"
[35] "k__Bacteria|p__Bacteroidota|c__Bacteroidia|o__Chitinophagales|f__Chitinophagaceae|g__Edaphobaculum"
[36] "k__Bacteria|p__Bacteroidota|c__Bacteroidia|o__Chitinophagales|f__Chitinophagaceae|g__UTBCD1"
[37] "k__Bacteria|p__Bacteroidota|c__Bacteroidia|o__Cytophagales|f__Hymenobacteraceae|g__Adhaeribacter"
[38] "k__Bacteria|p__Bacteroidota|c__Bacteroidia|o__Flavobacteriales|f__Flavobacteriaceae|g__Flavobacterium"
[39] "k__Bacteria|p__Firmicutes|c__Bacilli|o__Bacillales|f__Planococcaceae|g__Chungangia"
[40] "k__Bacteria|p__Firmicutes|c__Bacilli|o__Bacillales|f__Planococcaceae|g__Paenisporosarcina"
[41] "k__Bacteria|p__Myxococcota|c__Myxococcia|o__Myxococcales|f__Myxococcaceae|g__Corallococcus"
[42] "k__Bacteria|p__Proteobacteria|c__Alphaproteobacteria|o__Caulobacterales|f__Hyphomonadaceae|g__SWB02"
[43] "k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Burkholderiales|f__Burkholderiaceae|g__Ralstonia"
[44] "k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Burkholderiales|f__Comamonadaceae|g__Ramlibacter"
[45] "k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Burkholderiales|f__Comamonadaceae|g__Rubrivivax"
[46] "k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Burkholderiales|f__Nitrosomonadaceae|g__mle1-7"
[47] "k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Xanthomonadales|f__Xanthomonadaceae|g__Arenimonas"
Now we select a subset of the 47 markers to plot and visualize the results.
ssu_select <- c(
"k__Bacteria|p__Acidobacteriota|c__Acidobacteriae|o__Subgroup_2",
"k__Bacteria|p__Bacteroidota|c__Bacteroidia|o__Chitinophagales|f__Saprospiraceae",
"k__Bacteria|p__Bacteroidota|c__Bacteroidia|o__Cytophagales",
"k__Bacteria|p__Bacteroidota|c__Bacteroidia|o__Flavobacteriales",
"k__Bacteria|p__Bacteroidota|c__Bacteroidia|o__Sphingobacteriales",
"k__Bacteria|p__Myxococcota|c__Polyangia|o__mle1-27",
"k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Burkholderiales|f__Comamonadaceae|g__Rubrivivax",
"k__Bacteria|p__Actinobacteriota|c__Acidimicrobiia|o__Microtrichales",
"k__Bacteria|p__Actinobacteriota|c__Thermoleophilia|o__Gaiellales",
"k__Bacteria|p__Firmicutes|c__Bacilli|o__Bacillales",
"k__Bacteria|p__Myxococcota|c__Myxococcia|o__Myxococcales|f__Myxococcaceae|g__Corallococcus",
"k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Burkholderiales|f__Burkholderiaceae|g__Ralstonia"
)
(16S rRNA) Figure 9 | The response of select taxa to two years of warming by +3°C and +8°C. Differences assessed for multiple-group pair-wise comparisons using ANOVA followed by Tukey HSD post hoc tests. PERfect filtered read count data was log10 transformed and normalized using total sum scaling (TSS). The centre line of each box plot represents the median, the lower and upper hinges represent the first and third quartiles and whiskers represent + 1.5 the interquartile range. Significant differences denoted by asterisks (* p ≤ 0.05, ** p ≤ 0.01, *** p ≤ 0.001, **** p ≤ 0.0001; ns = not significant).
samp_ps <- c("its18_ps_work", "its18_ps_filt", "its18_ps_perfect", "its18_ps_pime")
indval_pval <- 0.05for (i in samp_ps) {
tmp_get <- get(i)
tmp_df <- data.frame(otu_table(tmp_get))
tmp_df <- tmp_df[, which(colSums(tmp_df) != 0)]
tmp_row_names <- row.names(tmp_df)
tmp_row_names <- tmp_row_names %>%
stringr::str_replace("P[0-9]{2}_D[0-9]{2}_[0-9]{3}_", "") %>%
stringr::str_replace("[A-Z]$", "")
tmp_df <- tmp_df %>% tibble::add_column(tmp_row_names, .before = 1)
tmp_name <- purrr::map_chr(i, ~ paste0(., "_seq_tab"))
assign(tmp_name, tmp_df)
rm(list = ls(pattern = "tmp_"))
}
objects()set.seed(1191)
for (i in samp_ps) {
tmp_get <- get(purrr::map_chr(i, ~ paste0(., "_seq_tab")))
tmp_iva <- indval(tmp_get[,-1], tmp_get[,1])
tmp_gr <- tmp_iva$maxcls[tmp_iva$pval <= indval_pval]
tmp_iv <- tmp_iva$indcls[tmp_iva$pval <= indval_pval]
tmp_pv <- tmp_iva$pval[tmp_iva$pval <= indval_pval]
tmp_fr <- apply(tmp_get[,-1] > 0, 2, sum)[tmp_iva$pval <= indval_pval]
tmp_sum <- data.frame(group = tmp_gr, indval = tmp_iv,
pval = tmp_pv, freq = tmp_fr)
tmp_sum <- tmp_sum[order(tmp_sum$group, -tmp_sum$indval),]
tmp_tax_df <- data.frame(tax_table(get(i)))
tmp_tax_df$ASV_ID <- NULL
tmp_sum_tax <- merge(tmp_sum, tmp_tax_df, by = "row.names", all = TRUE)
tmp_sum_tax <- tmp_sum_tax[!(is.na(tmp_sum_tax$group)),]
class(tmp_sum_tax$group) <- "character"
tmp_sum_tax$group <- stringr::str_replace(tmp_sum_tax$group, "^1$", "C0")
tmp_sum_tax$group <- stringr::str_replace(tmp_sum_tax$group, "^2$", "W3")
tmp_sum_tax$group <- stringr::str_replace(tmp_sum_tax$group, "^3$", "W8")
tmp_sum_tax <- tmp_sum_tax %>% dplyr::rename("ASV_ID" = "Row.names")
tmp_sum_tax <- tmp_sum_tax[order(as.numeric(gsub("[A-Z]{3}", "", tmp_sum_tax$ASV_ID))),]
tmp_sum_tax$ASV_ID <- as.character(tmp_sum_tax$ASV_ID)
tmp_sum.prob.corrected <- p.adjust(tmp_sum$pval, "bonferroni")
tmp_res_name <- purrr::map_chr(i, ~ paste0(., "_indval_summary"))
assign(tmp_res_name, tmp_sum_tax)
rm(list = ls(pattern = "tmp_"))
}Now we can save a few files and display the data.
for (i in samp_ps) {
tmp_get <- get(purrr::map_chr(i, ~ paste0(., "_seq_tab")))
tmp_get[,1] <- NULL
tmp_df <- as.data.frame(t(tmp_get))
tmp_col_names <- colnames(tmp_df)
tmp_col_names <- tmp_col_names %>%
stringr::str_replace("P[0-9]{2}_D[0-9]{2}_[0-9]{3}_", "") %>%
stringr::str_replace("[A-Z]$", "")
colnames(tmp_df) <- tmp_col_names
#tmp_df$freq_all <- apply(tmp_df > 0, 1, sum)
tmp_df$freq_C0 <- apply(tmp_df[ , (names(tmp_df) %in% "C0")] > 0, 1, sum)
tmp_df$freq_W3 <- apply(tmp_df[ , (names(tmp_df) %in% "W3")] > 0, 1, sum)
tmp_df$freq_W8 <- apply(tmp_df[ , (names(tmp_df) %in% "W8")] > 0, 1, sum)
tmp_df$reads_total <- base::rowSums(tmp_df[ , (names(tmp_df) %in% c("C0",
"W3",
"W8"))])
tmp_df$reads_C0 <- base::rowSums(tmp_df[ , (names(tmp_df) %in% "C0")])
tmp_df$reads_W3 <- base::rowSums(tmp_df[ , (names(tmp_df) %in% "W3")])
tmp_df$reads_W8 <- base::rowSums(tmp_df[ , (names(tmp_df) %in% "W8")])
tmp_df <- tmp_df[,!grepl("^[C | W]", names(tmp_df))]
tmp_df <- tmp_df %>% tibble::rownames_to_column("ASV_ID")
tmp_get_indval <- get(purrr::map_chr(i, ~ paste0(., "_indval_summary")))
tmp_merge_df <- merge(tmp_df, tmp_get_indval, by = "ASV_ID", all = FALSE)
tmp_merge_df <- tmp_merge_df[,c(1,9:12,2:8,13:19)]
tmp_merge_df <- tmp_merge_df[order(tmp_merge_df$reads_total, decreasing = TRUE), ]
tmp_merge_name <- purrr::map_chr(i, ~ paste0(., "_indval_final"))
assign(tmp_merge_name, tmp_merge_df)
rm(list = ls(pattern = "tmp_"))
}for (i in samp_ps) {
tmp_get <- get(i)
tmp_tab <- get(purrr::map_chr(i, ~ paste0(., "_indval_summary")))
tmp_list <- tmp_tab[,1]
tmp_ps <- prune_taxa(tmp_list, tmp_get)
tmp_name <- purrr::map_chr(i, ~ paste0(., "_ind"))
assign(tmp_name, tmp_ps)
tmp_ps@phy_tree <- NULL
tmp_ps <- prune_samples(sample_sums(tmp_ps) > 0, tmp_ps)
tmp_ps_tree <- rtree(ntaxa(tmp_ps), rooted = TRUE,
tip.label = taxa_names(tmp_ps))
tmp_ps <- merge_phyloseq(tmp_ps, sample_data, tmp_ps_tree)
assign(tmp_name, tmp_ps)
rm(list = ls(pattern = "tmp_"))
}
objects()(ITS) Table 1 | Significant results of Indicator Analysis for FULL data set.
(ITS) Table 2 | Significant results of Indicator Analysis for Arbitrary filtered ASV data set.
(ITS) Table 3 | Significant results of Indicator Analysis for PERfect filtered data set.
(ITS) Table 4 | Significant results of Indicator Analysis for PIME data set.
Details about LEFSE
Remove ASV seq, subset only ASVs
for (i in samp_ps) {
tmp_get <- get(i)
tax_table(tmp_get) <- tax_table(tmp_get)[,c(1:6)]
tax_table(tmp_get) <- cbind(tax_table(tmp_get),
rownames(tax_table(tmp_get)))
colnames(tax_table(tmp_get)) <- c("Kingdom", "Phylum", "Class", "Order",
"Family", "Genus", "Species")
tmp_name <- purrr::map_chr(i, ~ paste0(., "_rf_all"))
assign(tmp_name, tmp_get)
tax_table(tmp_get) <- tax_table(tmp_get)[,c(7)]
tmp_name_asv <- purrr::map_chr(i, ~ paste0(., "_rf_asv"))
assign(tmp_name_asv, tmp_get)
rm(list = ls(pattern = "tmp_"))
}
objects(pattern = "_rf")samp_ps <- c("its18_ps_work", "its18_ps_filt", "its18_ps_perfect", "its18_ps_pime")
lefse_lda <- 2
lefse_kw <- 0.05
lefse_wc <- 0.05 for (i in samp_ps) {
tmp_get <- get(purrr::map_chr(i, ~ paste0(., "_rf_asv")))
tmp_lefse <- run_lefse(tmp_get, norm = "CPM", class = "TEMP", correct = "2",
lda_cutoff = lefse_lda, kw_cutoff = lefse_kw, wilcoxon_cutoff = lefse_wc,
bootstrap_n = 30, bootstrap_fraction = 2/3,
sample_min = 10, multicls_strat = FALSE, curv = FALSE)
tmp_name <- purrr::map_chr(i, ~ paste0(., "_lefse_asv"))
assign(tmp_name, tmp_lefse)
rm(list = ls(pattern = "tmp_"))
}
objects(pattern = "_lefse_asv")
for (i in samp_ps) {
tmp_get <- get(purrr::map_chr(i, ~ paste0(., "_rf_all")))
tmp_lefse <- run_lefse(tmp_get, norm = "CPM", class = "TEMP", correct = "2",
lda_cutoff = lefse_lda, kw_cutoff = lefse_kw, wilcoxon_cutoff = lefse_wc,
bootstrap_n = 30, bootstrap_fraction = 2/3,
sample_min = 10, multicls_strat = FALSE, curv = FALSE)
tmp_name <- purrr::map_chr(i, ~ paste0(., "_lefse_all"))
assign(tmp_name, tmp_lefse)
rm(list = ls(pattern = "tmp_"))
}
objects(pattern = "_lefse_asv")
objects(pattern = "_lefse_all")for (i in samp_ps) {
tmp_o_tax <- data.frame(tax_table(get(i)))
tmp_o_tax$ASV_ID <- NULL
tmp_o_tax <- tmp_o_tax %>% tibble::rownames_to_column("ASV_ID")
tmp_get <- get(purrr::map_chr(i, ~ paste0(., "_lefse_asv")))
tmp_mt <- data.frame(marker_table(tmp_get))
tmp_mt$feature <- gsub('s__', '', tmp_mt$feature)
tmp_mt <- tmp_mt %>% dplyr::rename(c("ASV_ID" = 1, "group" = 2, "pval" = 4)) %>%
tibble::remove_rownames()
tmp_sum <- dplyr::left_join(tmp_mt, tmp_o_tax, by = "ASV_ID")
tmp_sum$group <- stringr::str_replace(tmp_sum$group, "^0$", "C0")
tmp_sum$group <- stringr::str_replace(tmp_sum$group, "^3$", "W3")
tmp_sum$group <- stringr::str_replace(tmp_sum$group, "^8$", "W8")
tmp_sum <- tmp_sum[order(as.numeric(gsub("[A-Z]{3}", "", tmp_sum$ASV_ID))),]
tmp_res_name <- purrr::map_chr(i, ~ paste0(., "_lefse_summary"))
assign(tmp_res_name, tmp_sum)
rm(list = ls(pattern = "tmp_"))
}Now we can save a few files and display the data.
for (i in samp_ps) {
tmp_get <- get(i)
tmp_df <- data.frame(t(otu_table(tmp_get)))
#tmp_get[,1] <- NULL
#tmp_df <- as.data.frame(t(tmp_otu))
tmp_col_names <- colnames(tmp_df)
tmp_col_names <- tmp_col_names %>%
stringr::str_replace("P[0-9]{2}_D[0-9]{2}_[0-9]{3}_", "") %>%
stringr::str_replace("[A-Z]$", "")
colnames(tmp_df) <- tmp_col_names
tmp_df$freq <- apply(tmp_df > 0, 1, sum)
tmp_df$freq_C0 <- apply(tmp_df[ , (names(tmp_df) %in% "C0")] > 0, 1, sum)
tmp_df$freq_W3 <- apply(tmp_df[ , (names(tmp_df) %in% "W3")] > 0, 1, sum)
tmp_df$freq_W8 <- apply(tmp_df[ , (names(tmp_df) %in% "W8")] > 0, 1, sum)
tmp_df$reads_total <- base::rowSums(tmp_df[ , (names(tmp_df) %in% c("C0", "W3", "W8"))])
tmp_df$reads_C0 <- base::rowSums(tmp_df[ , (names(tmp_df) %in% "C0")])
tmp_df$reads_W3 <- base::rowSums(tmp_df[ , (names(tmp_df) %in% "W3")])
tmp_df$reads_W8 <- base::rowSums(tmp_df[ , (names(tmp_df) %in% "W8")])
tmp_df <- tmp_df[,!grepl("^[C | W]", names(tmp_df))]
tmp_df <- tmp_df %>% tibble::rownames_to_column("ASV_ID")
tmp_get_lefse <- get(purrr::map_chr(i, ~ paste0(., "_lefse_summary")))
tmp_merge_df <- merge(tmp_df, tmp_get_lefse, by = "ASV_ID", all = FALSE)
tmp_merge_df <- tmp_merge_df[,c(1,10:12,2:9,14:20)]
tmp_merge_df <- tmp_merge_df[order(tmp_merge_df$reads_total, decreasing = TRUE), ]
tmp_merge_name <- purrr::map_chr(i, ~ paste0(., "_lefse_final"))
assign(tmp_merge_name, tmp_merge_df)
rm(list = ls(pattern = "tmp_"))
}Save a new phyloseq object
for (i in samp_ps) {
tmp_get <- get(i)
tmp_tab <- get(purrr::map_chr(i, ~ paste0(., "_lefse_summary")))
tmp_list <- tmp_tab[,1]
tmp_ps <- prune_taxa(tmp_list, tmp_get)
tmp_name <- purrr::map_chr(i, ~ paste0(., "_lefse"))
assign(tmp_name, tmp_ps)
tmp_ps@phy_tree <- NULL
tmp_ps <- prune_samples(sample_sums(tmp_ps) > 0, tmp_ps)
tmp_ps_tree <- rtree(ntaxa(tmp_ps), rooted = TRUE,
tip.label = taxa_names(tmp_ps))
tmp_ps <- merge_phyloseq(tmp_ps, sample_data, tmp_ps_tree)
assign(tmp_name, tmp_ps)
rm(list = ls(pattern = "tmp_"))
}
objects()(ITS) Table 5 | Significant results of LEfSe Analysis for FULL data set.
(ITS) Table 6 | Significant results of LEfSe Analysis for Arbitrary filtered ASV data set.
(ITS) Table 7 | Significant results of LEfSe Analysis for PERfect filtered data set.
(ITS) Table 8 | Significant results of LEfSe Analysis for PIME data set.
for (i in samp_ps) {
tmp_get <- get(purrr::map_chr(i, ~ paste0(., "_lefse_asv")))
tmp_mt <- data.frame(marker_table(tmp_get))
tmp_mt <- tmp_mt %>% filter(ef_lda >= 3)
tmp_mt <- marker_table(tmp_mt)
tmp_res_name <- purrr::map_chr(i, ~ paste0(., "_lefse_asv_viz_trim"))
assign(tmp_res_name, tmp_mt)
rm(list = ls(pattern = "tmp_"))
}
objects(pattern = "_lefse_asv_viz_trim")
(ITS) Figure 1 | Differentially abundant ASVs from LEfSe analysis for the FULL data set, trimmed to show those with LDA score ≥ 3.0.

(ITS) Figure 2 | Differentially abundant ASVs from LEfSe analysis for the Arbitrary filtered data set, trimmed to show those with LDA score ≥ 3.0.

(ITS) Figure 3 | Differentially abundant ASVs from LEfSe analysis for the PERfect filtered data set, trimmed to show those with LDA score ≥ 3.0.

(ITS) Figure 4 | Differentially abundant ASVs from LEfSe analysis for the PIME filtered data set, trimmed to show those with LDA score ≥ 3.0.
for (i in samp_ps) {
tmp_get <- get(purrr::map_chr(i, ~ paste0(., "_lefse_all")))
tmp_mt <- data.frame(marker_table(tmp_get))
tmp_mt <- tmp_mt[, c(2:4,1)]
tmp_mt <- tmp_mt %>% dplyr::rename(c("group" = 1, "pval" = 3))
tmp_mt$group <- stringr::str_replace(tmp_mt$group, "^0$", "C0")
tmp_mt$group <- stringr::str_replace(tmp_mt$group, "^3$", "W3")
tmp_mt$group <- stringr::str_replace(tmp_mt$group, "^8$", "W8")
tmp_mt <- tmp_mt %>% tibble::rownames_to_column("marker")
tmp_mt <- tmp_mt %>% tidyr::separate(col = "feature",
into = c("Kingdom", "Phylum", "Class",
"Order", "Family", "Genus", "ASV_ID"),
sep = "\\|\\w__")
tmp_mt$Kingdom <- gsub('k__', '', tmp_mt$Kingdom)
tmp_res_name <- purrr::map_chr(i, ~ paste0(., "_lefse_marker_final"))
assign(tmp_res_name, tmp_mt)
rm(list = ls(pattern = "tmp_"))
}(ITS) Table 9 | Significant results of LEfSe MARKER Analysis for FULL data set.
(ITS) Table 10 | Significant results of LEfSe MARKER Analysis for Arbitrary filtered ASV data set.
(ITS) Table 11 | Significant results of LEfSe MARKER Analysis for PERfect filtered data set.
(ITS) Table 12 | Significant results of LEfSe MARKER Analysis for PIME data set.
for (i in samp_ps) {
tmp_get <- get(purrr::map_chr(i, ~ paste0(., "_lefse_all")))
tmp_mt <- data.frame(marker_table(tmp_get))
tmp_mt <- tmp_mt %>% filter(ef_lda >= 3.5)
tmp_mt <- marker_table(tmp_mt)
tmp_res_name <- purrr::map_chr(i, ~ paste0(., "_lefse_all_viz_trim"))
assign(tmp_res_name, tmp_mt)
rm(list = ls(pattern = "tmp_"))
}
objects(pattern = "_lefse_all_viz_trim")
(ITS) Figure 5 | Differentially abundant MARKER from LEfSe analysis for the FULL data set, trimmed to show those with LDA score ≥ 4.0.

(ITS) Figure 6 | Differentially abundant MARKER from LEfSe analysis for the Arbitrary filtered data set, trimmed to show those with LDA score ≥ 4.0.

(ITS) Figure 7 | Differentially abundant MARKER from LEfSe analysis for the PERfect filtered data set, trimmed to show those with LDA score ≥ 4.0.

(ITS) Figure 8 | Differentially abundant MARKER from LEfSe analysis for the PIME filtered data set, trimmed to show those with LDA score ≥ 4.0.
The workflow here is basically a carbon copy of the workflow describe above. Presented here for posterity.
## FIX ps object
ps <- its18_ps_perfect_rf_all
tmp_tax1 <- data.frame(tax_table(ps))
tmp_tax1$ASV_SEQ <- NULL
tmp_rn <- row.names(tmp_tax1)
tmp_tax <- data.frame(lapply(tmp_tax1, function(x) {gsub("\\(|)", "", x)}))
row.names(tmp_tax) <- tmp_rn
identical(row.names(tmp_tax), row.names(tmp_tax1))
#dplyr::filter(tmp_tax, Phylum == "Acidobacteriota")
ps_tax_new <- as.matrix(tmp_tax)
tmp_ps <- phyloseq(otu_table(ps),
tax_table(ps_tax_new),
sample_data(ps))
its_ps <- tmp_ps
phyloseq::rank_names(its_ps)
data.frame(tax_table(its_ps))its_group_anova <- run_test_multiple_groups(its_ps, group = "TEMP",
taxa_rank = "all", method = "anova")
its_group_anova@marker_table
marker_table(its_group_anova)its_default_pht <- run_posthoc_test(its_ps,
group = "TEMP", method = "tukey",
transform = "log10")its_pht <- its_default_pht
its18_pht_filt <- filter(data.frame(its_pht@result), pvalue <= "0.05")[!grepl("ASV", filter(data.frame(its_pht@result), pvalue <= "0.05")$group_name),]
its18_pht_filt <- its18_pht_filt[!grepl("[a-z]__$", its18_pht_filt$group_name),]
its18_pht_filt <- unique(its18_pht_filt$group_name)
length(its18_pht_filt)There are 32 significantly different lineage markers.
[1] "k__Fungi"
[2] "k__Fungi|p__Basidiomycota"
[3] "k__Fungi|p__Chytridiomycota"
[4] "k__Fungi|p__Glomeromycota"
[5] "k__Fungi|p__Mortierellomycota"
[6] "k__Fungi|p__Basidiomycota|c__Agaricomycetes"
[7] "k__Fungi|p__Basidiomycota|c__Microbotryomycetes"
[8] "k__Fungi|p__Glomeromycota|c__Glomeromycetes"
[9] "k__Fungi|p__Mortierellomycota|c__Mortierellomycetes"
[10] "k__Fungi|p__Rozellomycota|c__Rozellomycotina_cls_Incertae_sedis"
[11] "k__Fungi|p__Basidiomycota|c__Agaricomycetes|o__Agaricales"
[12] "k__Fungi|p__Basidiomycota|c__Microbotryomycetes|o__Sporidiobolales"
[13] "k__Fungi|p__Glomeromycota|c__Glomeromycetes|o__Glomerales"
[14] "k__Fungi|p__Mortierellomycota|c__Mortierellomycetes|o__Mortierellales"
[15] "k__Fungi|p__Rozellomycota|c__Rozellomycotina_cls_Incertae_sedis|o__GS11"
[16] "k__Fungi|p__Ascomycota|c__Eurotiomycetes|o__Eurotiales|f__Trichocomaceae"
[17] "k__Fungi|p__Ascomycota|c__Pezizomycetes|o__Pezizales|f__Pyronemataceae"
[18] "k__Fungi|p__Ascomycota|c__Saccharomycetes|o__Saccharomycetales|f__Metschnikowiaceae"
[19] "k__Fungi|p__Ascomycota|c__Sordariomycetes|o__Xylariales|f__Microdochiaceae"
[20] "k__Fungi|p__Basidiomycota|c__Agaricomycetes|o__Agaricales|f__Clavariaceae"
[21] "k__Fungi|p__Basidiomycota|c__Agaricomycetes|o__Agaricales|f__Entolomataceae"
[22] "k__Fungi|p__Basidiomycota|c__Microbotryomycetes|o__Sporidiobolales|f__Sporidiobolaceae"
[23] "k__Fungi|p__Glomeromycota|c__Glomeromycetes|o__Glomerales|f__Glomeraceae"
[24] "k__Fungi|p__Mortierellomycota|c__Mortierellomycetes|o__Mortierellales|f__Mortierellaceae"
[25] "k__Fungi|p__Ascomycota|c__Eurotiomycetes|o__Eurotiales|f__Trichocomaceae|g__Talaromyces"
[26] "k__Fungi|p__Ascomycota|c__Pezizomycetes|o__Pezizales|f__Pyronemataceae|g__Scutellinia"
[27] "k__Fungi|p__Ascomycota|c__Saccharomycetes|o__Saccharomycetales|f__Metschnikowiaceae|g__Metschnikowia"
[28] "k__Fungi|p__Ascomycota|c__Sordariomycetes|o__Hypocreales|f__Nectriaceae|g__Fusarium"
[29] "k__Fungi|p__Ascomycota|c__Sordariomycetes|o__Xylariales|f__Microdochiaceae|g__Idriella"
[30] "k__Fungi|p__Basidiomycota|c__Microbotryomycetes|o__Sporidiobolales|f__Sporidiobolaceae|g__Rhodotorula"
[31] "k__Fungi|p__Glomeromycota|c__Glomeromycetes|o__Glomerales|f__Glomeraceae|g__Kamienskia"
[32] "k__Fungi|p__Mortierellomycota|c__Mortierellomycetes|o__Mortierellales|f__Mortierellaceae|g__Mortierella"

(ITS) Figure 9 | The response of select taxa to two years of warming by +3°C and +8°C. Differences assessed for multiple-group pair-wise comparisons using ANOVA followed by Tukey HSD post hoc tests. PERfect filtered read count data was log10 transformed and normalized using total sum scaling (TSS). The centre line of each box plot represents the median, the lower and upper hinges represent the first and third quartiles and whiskers represent + 1.5 the interquartile range. Significant differences denoted by asterisks (* p ≤ 0.05, ** p ≤ 0.01, *** p ≤ 0.001, **** p ≤ 0.0001; ns = not significant).
Next, we will combine the results of the ISA and LEfSe analyses with the distribution of ASVs across each sample. We are going to do the analysis in anvi’o—an advanced analysis and visualization platform for ’omics data (Eren et al. 2015)—using the anvi-interactive command. Anvi’o likes databases but it also understands that sometimes you do not have a database. So it offers a manual mode. If you type this command you can have a look at the relevant pieces we need for the visualization, specifically those under the headings MANUAL INPUTS and ADDITIONAL STUFF.
anvi-interactive -hMANUAL INPUTS:
Mandatory input parameters to start the interactive interface without
anvi'o databases.
--manual-mode We need this flag to run anvi'o in an ad hoc
manner, i.e., no database.
-f FASTA, --fasta-file FASTA
A FASTA-formatted input file. This is sort of
optional
-d VIEW_DATA, --view-data VIEW_DATA
A TAB-delimited file for view data. This is the ASV
by sample matrix. We need this
-t NEWICK, --tree NEWICK
NEWICK formatted tree structure. How the ASVs are
ordered in our case.
ADDITIONAL STUFF:
Parameters to provide additional layers, views, or layer data.
-V ADDITIONAL_VIEW, --additional-view ADDITIONAL_VIEW
A TAB-delimited file for an additional view to be used
in the interface. This file should contain all split
names, and values for each of them in all samples.
Each column in this file must correspond to a sample
name. Content of this file will be called 'user_view',
which will be available as a new item in the 'views'
combo box in the interface
-A ADDITIONAL_LAYERS, --additional-layers ADDITIONAL_LAYERS
A TAB-delimited file for additional layer info. In
our case this is info about each ASV. The first column
of the file must be the ASV names, and
the remaining columns should be unique attributes.
There are also a few files we generate that cannot be loaded directly. So, in addition to the files that can be loaded when running the interactive, we also have files that must be added to the database created by anvi’o.
Here is a nice tutorial on Working with anvi’o additional data tables. A lot of what we need is covered in this tutorial. To get the most out the visualization we need to create a few files to give anvi’o when we fire up the interactive interface.
Let’s start with the -d or --view-data file. This file needs to be an ASV by sample matrix of read counts. To simplify the visualization, we will use all ASVs represented by 100 or more total reads, including those identified as differentially abundant by the ISA and/or LEfSe.
samp_ps_all <- c("ssu18_ps_filt", "ssu18_ps_pime", "ssu18_ps_perfect")
samp_ps <- c("ssu18_ps_filt", "ssu18_ps_pime", "ssu18_ps_perfect")
samp_ps_org <- c("ssu18_ps_work")
samp_ps_pime <- c("ssu18_ps_pime")
samp_ps_other <- c("ssu18_ps_filt", "ssu18_ps_perfect")
trim_val <- 100
for (i in samp_ps_pime) {
tmp_get <- get(i)
tmp_df <- prune_taxa(taxa_sums(tmp_get) > trim_val, tmp_get)
tmp_name <- purrr::map_chr(i, ~ paste0(., "_trim"))
assign(tmp_name, tmp_df)
rm(list = ls(pattern = "tmp_"))
}
trim_val <- 300
for (i in samp_ps_other) {
tmp_get <- get(i)
tmp_df <- prune_taxa(taxa_sums(tmp_get) > trim_val, tmp_get)
tmp_name <- purrr::map_chr(i, ~ paste0(., "_trim"))
assign(tmp_name, tmp_df)
rm(list = ls(pattern = "tmp_"))
}
objects(pattern = "_trim")for (i in samp_ps) {
tmp_get <- get(purrr::map_chr(i, ~ paste0(., "_trim")))
tmp_df <- as.data.frame(t(otu_table(tmp_get)))
tmp_df <- tmp_df %>% rownames_to_column("Group")
tmp_path <- file.path("files/anvio/ssu/")
write.table(tmp_df, paste(tmp_path, i, "/", "data.txt", sep = ""),
quote = FALSE, sep = "\t", row.names = FALSE)
rm(list = ls(pattern = "tmp_"))
}Or export a table of transformed data.
for (i in samp_ps) {
tmp_get <- get(purrr::map_chr(i, ~ paste0(., "_trim")))
tmp_trans <- transform_sample_counts(tmp_get, function(x) 1e5 * {x/sum(x)})
tmp_df <- as.data.frame(t(otu_table(tmp_trans)))
tmp_df <- tmp_df %>% rownames_to_column("Group")
#tmp_name <- purrr::map_chr(i, ~ paste0(., "_trim_tab"))
#assign(tmp_name, tmp_df)
tmp_path <- file.path("files/anvio/ssu/")
write.table(tmp_df, paste(tmp_path, i, "/", "data_trans.txt", sep = ""),
quote = FALSE, sep = "\t", row.names = FALSE)
rm(list = ls(pattern = "tmp_"))
}Next, we need some additional data about the ASVs to overlay on the visual. This can be anything however what I specifically want are the details of the ISA analysis, total reads, and lineage info. I warn you; this code will get ugly and I urge you to find a better way.
Start with an ASV + lineage table for the ASVs in the new phyloseq object.
for (i in samp_ps) {
tmp_get_indval <- get(purrr::map_chr(i, ~ paste0(., "_indval_final")))
tmp_get_indval <- tmp_get_indval %>% dplyr::rename("Group" = "ASV_ID") %>%
dplyr::rename("enrich_indval" = "group") %>%
dplyr::rename("test_indval" = "indval") %>%
dplyr::rename("pval_indval" = "pval")
tmp_get_indval <- tmp_get_indval[,1:5]
tmp_get_lefse <- get(purrr::map_chr(i, ~ paste0(., "_lefse_final")))
tmp_get_lefse <- tmp_get_lefse %>% dplyr::rename("Group" = "ASV_ID") %>%
dplyr::rename("enrich_lefse" = "group") %>%
dplyr::rename("test_lefse" = "lda") %>%
dplyr::rename("pval_lefse" = "pval")
tmp_get_lefse <- tmp_get_lefse[,1:4]
tmp_get <- get(purrr::map_chr(i, ~ paste0(., "_trim")))
tmp_otu_df <- as.data.frame(t(otu_table(tmp_get)))
tmp_total <- cbind(tmp_otu_df, total_reads = rowSums(tmp_otu_df))
tmp_total <- rev(tmp_total)[1]
tmp_total <- tmp_total %>% tibble::rownames_to_column("Group")
tmp_tax_df <- as.data.frame(tax_table(tmp_get))
tmp_tax_df$ASV_SEQ <- NULL
tmp_tax_df$ASV_ID <- NULL
tmp_tax_df <- tmp_tax_df %>% tibble::rownames_to_column("Group")
tmp_add_lay <- dplyr::left_join(tmp_tax_df, tmp_total, by = "Group") %>%
dplyr::left_join(., tmp_get_indval, by = "Group") %>%
dplyr::left_join(., tmp_get_lefse, by = "Group")
tmp_add_lay$ASV_ID <- tmp_add_lay$Group
tmp_add_lay <- tmp_add_lay[, c(1,16,8,12,9:11,13:15,2:7)]
tmp_path <- file.path("files/anvio/ssu/")
write.table(tmp_add_lay, paste(tmp_path, i, "/", "additional_layers.txt", sep = ""),
quote = FALSE, sep = "\t", row.names = FALSE, na = "")
rm(list = ls(pattern = "tmp_"))
}Now we want some general data about the samples to overlay on the visual. Again, this can be anything. How about a table of alpha diversity metrics? We actually have such a table that was generated way back up the road. Just need to fix the column names.
metadata_tab <- read.table("files/metadata/tables/metadata.txt",
header = TRUE)
tmp_x <- readRDS("files/alpha/rdata/ssu18_ps_pime.rds")
data.frame(sample_data(tmp_x))
metadata_tab[,c(2:5)] <- list(NULL)
for (i in samp_ps_all) {
tmp_get <- get(i)
tmp_df <- data.frame(sample_data(tmp_get))
tmp_df <- tmp_df[,c(2:9)]
tmp_df <- tmp_df %>% tibble::rownames_to_column("id")
tmp_df <- tmp_df %>% dplyr::rename("no_asvs" = "Observed")
tmp_rc <- data.frame(readcount(tmp_get))
tmp_rc <- tmp_rc %>% tibble::rownames_to_column("id")
tmp_rc <- tmp_rc %>% dplyr::rename("no_reads" = 2)
#identical(tmp_df$id, tmp_rc$id)
tmp_merge <- dplyr::left_join(tmp_df, tmp_rc)
tmp_merge <- tmp_merge[, c(1:6,10,7:9)]
tmp_final <- dplyr::left_join(tmp_merge, metadata_tab)
tmp_path <- file.path("files/anvio/ssu/")
write.table(tmp_final, paste(tmp_path, i, "/", "additional_views.txt", sep = ""),
quote = FALSE, sep = "\t", row.names = FALSE)
rm(list = ls(pattern = "tmp_"))
}
rm(metadata_tab)Turned out this was a little tricky to figure out, but thanks to a little nifty block of code written by guoyanzhao on the phyloseq Issues forum, it was a piece of cake. The code can be altered to take any rank. See the post for an explanation.
Anyway, the goal is to sum each taxon at some rank and present that as a bar chart for each sample in the visualization. Anvi’o has a specific format it needs where each row is a sample and each column is a taxon. Taxa names need the prefix t_<RANK>!. For example, t_class! should be added for Class rank.
pick_rank <- "Phylum"
pick_rank_l <- "phylum"
for (i in samp_ps_all) {
# Make the table
tmp_get <- get(i)
tmp_glom <- tax_glom(tmp_get, taxrank = pick_rank)
tmp_melt <- psmelt(tmp_glom)
tmp_melt[[pick_rank]] <- as.character(tmp_melt[[pick_rank]])
tmp_abund <- aggregate(Abundance ~ Sample + tmp_melt[[pick_rank]], tmp_melt, FUN = sum)
colnames(tmp_abund)[2] <- "tax_rank"
library(reshape2)
tmp_abund <- as.data.frame(reshape::cast(tmp_abund, Sample ~ tax_rank))
tmp_abund <- tibble::remove_rownames(tmp_abund)
tmp_abund <- tibble::column_to_rownames(tmp_abund, "Sample")
# Reorder table column by sum
tmp_layers <- tmp_abund[,names(sort(colSums(tmp_abund), decreasing = TRUE))]
# Add the prefix
tmp_layers <- tmp_layers %>% dplyr::rename_all(function(x) paste0("t_", pick_rank_l,"!", x))
tmp_layers <- tibble::rownames_to_column (tmp_layers, "taxon")
# save the table
tmp_path <- file.path("files/anvio/ssu/")
write.table(tmp_layers, paste(tmp_path, i, "/", "tax_layers.txt", sep = ""),
quote = FALSE, sep = "\t", row.names = FALSE)
rm(list = ls(pattern = "tmp_"))
}The last piece we need is to generate dendrograms that order the ASVs by their distribution in the samples and the samples by their ASV composition. For this task we will use anvi’o.
anvi-matrix-to-newick data.txt --distance euclidean \
--linkage ward \
-o asv.tre
anvi-matrix-to-newick data.txt --distance euclidean \
--linkage ward \
-o sample.tre \
--transposeThe first command reads the view data we generated above and uses Euclidean distance and Ward linkage for hierarchical clustering of the ASVs. The second command transposes the view data table and then does the same for the samples. There are several distance metrics and linkage methods available. See the help menu for the command by typing anvi-matrix-to-newick -h. Boom.
Alternatively, we can generate dendrograms using phyloseq::distance and hclust.
pick_dist <- "bray"
pick_clust <- "complete"
for (i in samp_ps) {
# Make the table
tmp_get <- get(i)
tmp_dist <- phyloseq::distance(physeq = tmp_get, method = pick_dist, type = "sample")
tmp_dend <- hclust(tmp_dist, method = pick_clust)
plot(tmp_dend, hang = -1)
tmp_tree <- as.phylo(tmp_dend)
tmp_path <- file.path("files/anvio/ssu/")
write.tree(phy = tmp_tree, file = paste(tmp_path, i, "/", "sample_", pick_dist, "_", pick_clust, ".tre", sep = ""))
rm(list = ls(pattern = "tmp_"))
}pick_dist_asv <- "euclidean"
pick_clust_asv <- "ward"
for (i in samp_ps) {
# Make the table
tmp_get <- get(purrr::map_chr(i, ~ paste0(., "_trim")))
tmp_dist <- phyloseq::distance(physeq = tmp_get, method = pick_dist_asv, type = "taxa")
tmp_dend <- hclust(tmp_dist, method = pick_clust_asv)
plot(tmp_dend, hang = -1)
tmp_tree <- as.phylo(tmp_dend)
tmp_path <- file.path("files/anvio/ssu/")
write.tree(phy = tmp_tree, file = paste(tmp_path, i, "/", "asv_", pick_dist_asv, "_", pick_clust_asv, ".tre", sep = ""))
rm(list = ls(pattern = "tmp_"))
}The ASV tree is fine as is, but the sample tree needs a special format. Specifically, the tree needs to be in a three column, tab delimited, table. This way you can add multiple orderings to the same file and view them all in the interactive. The table needs to be in this format:
| item_name | data_type | data_value |
|---|---|---|
| tree_1 | newick | ((P01_D00_010_W8A:0.0250122,P05_D00_010_W8C:0.02,.. |
| tree_2 | newick | ((((((((OTU14195:0.0712585,OTU13230:0.0712585)0:,… |
| (…) | (…) | (…) |
This is easy to do by hand, but I really need the practice so I will do it in R. Anvi’o is very particular about formatting. For example, if this file ends with a blank line, which it will because when anvi’o made the initial dendrogram it add a new line. We need to get rid of that or we get an error when trying to import the table.
for (i in samp_ps) {
tmp_path <- file.path("files/anvio/ssu/")
tmp_tree <- read_file(paste(tmp_path, i, "/", "sample.tre", sep = ""))
tmp_tree <- gsub("[\r\n]", "", tmp_tree)
tmp_item <- c("bray_complete")
tmp_type <- c("newick")
tmp_df <- c(tmp_tree)
tmp_tab <- data.frame(tmp_item, tmp_type, tmp_df)
library(janitor)
tmp_tab %>% remove_empty("rows")
colnames(tmp_tab) <- c("item_name", "data_type", "data_value")
write.table(tmp_tab, paste(tmp_path, i, "/", "sample.tre", sep = ""),
sep = "\t", quote = FALSE, row.names = FALSE, na = "")
rm(list = ls(pattern = "tmp_"))
}# FOR TRANSFORMED DATA
for (i in samp_ps) {
tmp_path <- file.path("files/anvio/ssu/")
tmp_tree <- read_file(paste(tmp_path, i, "/","sample_trans.tre", sep = ""))
tmp_tree <- gsub("[\r\n]", "", tmp_tree)
tmp_item <- c("bray_complete")
tmp_type <- c("newick")
tmp_df <- c(tmp_tree)
tmp_tab <- data.frame(tmp_item, tmp_type, tmp_df)
library(janitor)
tmp_tab %>% remove_empty("rows")
colnames(tmp_tab) <- c("item_name", "data_type", "data_value")
write.table(tmp_tab, paste(tmp_path, i, "/", "sample_trans.tre", sep = ""),
sep = "\t", quote = FALSE, row.names = FALSE, na = "")
rm(list = ls(pattern = "tmp_"))
}
objects()# FOR HCLUST TREE
for (i in samp_ps) {
tmp_path <- file.path("files/anvio/ssu/")
tmp_tree <- read_file(paste(tmp_path, i, "/", "sample_", pick_dist, "_", pick_clust, ".tre", sep = ""))
tmp_tree <- gsub("[\r\n]", "", tmp_tree)
tmp_item <- c(paste(pick_dist, "_", pick_clust, "_hclust", sep = ""))
tmp_type <- c("newick")
tmp_df <- c(tmp_tree)
tmp_tab <- data.frame(tmp_item, tmp_type, tmp_df)
library(janitor)
tmp_tab %>% remove_empty("rows")
colnames(tmp_tab) <- c("item_name", "data_type", "data_value")
write.table(tmp_tab, paste(tmp_path, i, "/", "sample_", pick_dist, "_", pick_clust, ".tre", sep = ""),
sep = "\t", quote = FALSE, row.names = FALSE, na = "")
rm(list = ls(pattern = "tmp_"))
}We don’t need to add a fasta file, but it is a nice way to keep everything in one place. Plus, you can do BLAST searches directly in the interface by right clicking on the ASV of interest, so it is nice to have the sequences.
for (i in samp_ps) {
tmp_get <- get(purrr::map_chr(i, ~ paste0(., "_trim")))
tmp_tab <- tax_table(tmp_get)
tmp_tab <- tmp_tab[, 7]
tmp_df <- data.frame(row.names(tmp_tab), tmp_tab)
colnames(tmp_df) <- c("ASV_ID", "ASV_SEQ")
tmp_df$ASV_ID <- sub("^", ">", tmp_df$ASV_ID)
tmp_path <- file.path("files/anvio/ssu/")
write.table(tmp_df, paste(tmp_path, i, "/", i, ".fasta", sep = ""),
sep = "\n", col.names = FALSE, row.names = FALSE,
quote = FALSE, fileEncoding = "UTF-8")
rm(list = ls(pattern = "tmp_"))
}Time to put all of these pieces together. This gets a little tricky since we do not have a database to start with because some of these files can be loaded directly in the interface but some need to be added to a database. When we fire up the interactive in --manual mode, we must give anvi’o the name of a database and it will create that database for us. Then we can shut down the interactive, add the necessary data files, and start back up.
anvi-interactive --view-data data.txt \
--tree asv.tre \
--additional-layers additional_layers.txt \
--profile-db profile.db \
--manualNow we have a new profile database that we can add the sample metadata (additional_layers.txt) and the sample dendrogram (sample.tre) using the command anvi-import-misc-data. These commands add the table to the new profile.db. First, kill the interactive.
anvi-import-misc-data additional_views.txt \
--pan-or-profile-db profile.db \
--target-data-table layers
anvi-import-misc-data sample.tre \
--pan-or-profile-db profile.db \
--target-data-table layer_ordersOne last this is to get the table with the taxonomy total by sample (tax_layers.txt) into the profile database. We will run the same command we just used.
anvi-import-misc-data tax_layers.txt \
--pan-or-profile-db profile.db \
--target-data-table layersIn fact, we could just as easily append the taxonomy total data onto the additional_layers.txt and import in one command. But we didn’t.
With a populated database in hand, we can now begin modifying the visual by running the interactive command again.
anvi-interactive --view-data data.txt \
--tree asv.tre \
--additional-layers additional_layers.txt \
--profile-db profile.db
--fasta-file anvio.fasta \
--manualThe ITS version of the anvi’o workflow is basically a carbon copy of the workflow presented above. It is included here for posterity.
Next, we will combine the results of the ISA and LEfSe analyses with the distribution of ASVs across each sample. We are going to do the analysis in anvi’o—an advanced analysis and visualization platform for ’omics data (Eren et al. 2015)—using the anvi-interactive command. Anvi’o likes databases but it also understands that sometimes you do not have a database. So it offers a manual mode. If you type this command you can have a look at the relevant pieces we need for the visualization, specifically those under the headings MANUAL INPUTS and ADDITIONAL STUFF.
anvi-interactive -hMANUAL INPUTS:
Mandatory input parameters to start the interactive interface without
anvi'o databases.
--manual-mode We need this flag to run anvi'o in an ad hoc
manner, i.e., no database.
-f FASTA, --fasta-file FASTA
A FASTA-formatted input file. This is sort of
optional
-d VIEW_DATA, --view-data VIEW_DATA
A TAB-delimited file for view data. This is the ASV
by sample matrix. We need this
-t NEWICK, --tree NEWICK
NEWICK formatted tree structure. How the ASVs are
ordered in our case.
ADDITIONAL STUFF:
Parameters to provide additional layers, views, or layer data.
-V ADDITIONAL_VIEW, --additional-view ADDITIONAL_VIEW
A TAB-delimited file for an additional view to be used
in the interface. This file should contain all split
names, and values for each of them in all samples.
Each column in this file must correspond to a sample
name. Content of this file will be called 'user_view',
which will be available as a new item in the 'views'
combo box in the interface
-A ADDITIONAL_LAYERS, --additional-layers ADDITIONAL_LAYERS
A TAB-delimited file for additional layer info. In
our case this is info about each ASV. The first column
of the file must be the ASV names, and
the remaining columns should be unique attributes.
There are also a few files we generate that cannot be loaded directly. So, in addition to the files that can be loaded when running the interactive, we also have files that must be added to the database created by anvi’o.
Here is a nice tutorial on Working with anvi’o additional data tables. A lot of what we need is covered in this tutorial. To get the most out the visualization we need to create a few files to give anvi’o when we fire up the interactive interface.
Let’s start with the -d or --view-data file. This file needs to be an ASV by sample matrix of read counts. To simplify the visualization, we will use all ASVs represented by 100 or more total reads, including those identified as differentially abundant by the ISA and/or LEfSe.
samp_ps_all <- c("its18_ps_filt", "its18_ps_pime", "its18_ps_perfect",
"its18_ps_filt_otu", "its18_ps_pime_otu", "its18_ps_perfect_otu",
"its18_ps_work", "its18_ps_work_otu")
samp_ps <- c("its18_ps_filt", "its18_ps_pime", "its18_ps_perfect",
"its18_ps_filt_otu", "its18_ps_pime_otu", "its18_ps_perfect_otu")
samp_ps_org <- c("its18_ps_work", "its18_ps_work_otu")
samp_ps_pime <- c("its18_ps_pime", "its18_ps_pime_otu")
samp_ps_other <- c("its18_ps_filt", "its18_ps_perfect",
"its18_ps_filt_otu", "its18_ps_perfect_otu")
trim_val <- 50
for (i in samp_ps_pime) {
tmp_get <- get(i)
tmp_df <- prune_taxa(taxa_sums(tmp_get) > trim_val, tmp_get)
tmp_name <- purrr::map_chr(i, ~ paste0(., "_trim"))
assign(tmp_name, tmp_df)
rm(list = ls(pattern = "tmp_"))
}
trim_val <- 300
for (i in samp_ps_other) {
tmp_get <- get(i)
tmp_df <- prune_taxa(taxa_sums(tmp_get) > trim_val, tmp_get)
tmp_name <- purrr::map_chr(i, ~ paste0(., "_trim"))
assign(tmp_name, tmp_df)
rm(list = ls(pattern = "tmp_"))
}
objects(pattern = "_trim")
its18_ps_filt_trim
its18_ps_pime_trim
its18_ps_perfect_trim
its18_ps_filt_otu_trim
its18_ps_pime_otu_trim
its18_ps_perfect_otu_trimfor (i in samp_ps) {
tmp_get <- get(purrr::map_chr(i, ~ paste0(., "_trim")))
tmp_df <- as.data.frame(t(otu_table(tmp_get)))
tmp_df <- tmp_df %>% rownames_to_column("Group")
tmp_path <- file.path("files/anvio/its/")
write.table(tmp_df, paste(tmp_path, i, "/", "data.txt", sep = ""),
quote = FALSE, sep = "\t", row.names = FALSE)
rm(list = ls(pattern = "tmp_"))
}
objects()Or export a table of transformed data.
for (i in samp_ps) {
tmp_get <- get(purrr::map_chr(i, ~ paste0(., "_trim")))
tmp_trans <- transform_sample_counts(tmp_get, function(x) 1e5 * {x/sum(x)})
tmp_df <- as.data.frame(t(otu_table(tmp_trans)))
tmp_df <- tmp_df %>% rownames_to_column("Group")
#tmp_name <- purrr::map_chr(i, ~ paste0(., "_trim_tab"))
#assign(tmp_name, tmp_df)
tmp_path <- file.path("files/anvio/its/")
write.table(tmp_df, paste(tmp_path, i, "/", "data_trans.txt", sep = ""),
quote = FALSE, sep = "\t", row.names = FALSE)
rm(list = ls(pattern = "tmp_"))
}
objects()Next, we need some additional data about the ASVs to overlay on the visual. This can be anything however what I specifically want are the details of the ISA analysis, total reads, and lineage info. I warn you; this code will get ugly and I urge you to find a better way.
Start with an ASV + lineage table for the ASVs in the new phyloseq object.
for (i in samp_ps) {
tmp_get_indval <- get(purrr::map_chr(i, ~ paste0(., "_indval_final")))
tmp_get_indval <- tmp_get_indval %>% dplyr::rename("Group" = "ASV_ID") %>%
dplyr::rename("enrich_indval" = "group") %>%
dplyr::rename("test_indval" = "indval") %>%
dplyr::rename("pval_indval" = "pval")
tmp_get_indval <- tmp_get_indval[,1:5]
tmp_get_lefse <- get(purrr::map_chr(i, ~ paste0(., "_lefse_final")))
tmp_get_lefse <- tmp_get_lefse %>% dplyr::rename("Group" = "ASV_ID") %>%
dplyr::rename("enrich_lefse" = "group") %>%
dplyr::rename("test_lefse" = "lda") %>%
dplyr::rename("pval_lefse" = "pval")
tmp_get_lefse <- tmp_get_lefse[,1:4]
tmp_get <- get(purrr::map_chr(i, ~ paste0(., "_trim")))
tmp_otu_df <- as.data.frame(t(otu_table(tmp_get)))
tmp_total <- cbind(tmp_otu_df, total_reads = rowSums(tmp_otu_df))
tmp_total <- rev(tmp_total)[1]
tmp_total <- tmp_total %>% tibble::rownames_to_column("Group")
tmp_tax_df <- as.data.frame(tax_table(tmp_get))
tmp_tax_df$ASV_SEQ <- NULL
tmp_tax_df$ASV_ID <- NULL
tmp_tax_df <- tmp_tax_df %>% tibble::rownames_to_column("Group")
tmp_add_lay <- dplyr::left_join(tmp_tax_df, tmp_total, by = "Group") %>%
dplyr::left_join(., tmp_get_indval, by = "Group") %>%
dplyr::left_join(., tmp_get_lefse, by = "Group")
tmp_add_lay$ASV_ID <- tmp_add_lay$Group
tmp_add_lay <- tmp_add_lay[, c(1,16,8,12,9:11,13:15,2:7)]
tmp_path <- file.path("files/anvio/its/")
write.table(tmp_add_lay, paste(tmp_path, i, "/", "additional_layers.txt", sep = ""),
quote = FALSE, sep = "\t", row.names = FALSE, na = "")
rm(list = ls(pattern = "tmp_"))
}Now we want some general data about the samples to overlay on the visual. Again, this can be anything. How about a table of alpha diversity metrics? We actually have such a table that was generated way back up the road. Just need to fix the column names.
metadata_tab <- read.table("files/metadata/tables/metadata.txt",
header = TRUE)
tmp_x <- readRDS("files/alpha/rdata/its18_ps_pime.rds")
data.frame(sample_data(tmp_x))
metadata_tab[,c(2:5)] <- list(NULL)
for (i in samp_ps_all) {
tmp_get <- get(i)
tmp_df <- data.frame(sample_data(tmp_get))
tmp_df <- tmp_df[,c(2:9)]
tmp_df <- tmp_df %>% tibble::rownames_to_column("id")
tmp_df <- tmp_df %>% dplyr::rename("no_asvs" = "Observed")
tmp_rc <- data.frame(readcount(tmp_get))
tmp_rc <- tmp_rc %>% tibble::rownames_to_column("id")
tmp_rc <- tmp_rc %>% dplyr::rename("no_reads" = 2)
#identical(tmp_df$id, tmp_rc$id)
tmp_merge <- dplyr::left_join(tmp_df, tmp_rc)
tmp_merge <- tmp_merge[, c(1:6,10,7:9)]
tmp_final <- dplyr::left_join(tmp_merge, metadata_tab)
tmp_path <- file.path("files/anvio/its/")
write.table(tmp_final, paste(tmp_path, i, "/", "additional_views.txt", sep = ""),
quote = FALSE, sep = "\t", row.names = FALSE)
rm(list = ls(pattern = "tmp_"))
}
rm(metadata_tab)Turned out this was a little tricky to figure out, but thanks to a little nifty block of code written by guoyanzhao on the phyloseq Iitses forum, it was a piece of cake. The code can be altered to take any rank. See the post for an explanation.
Anyway, the goal is to sum each taxon at some rank and present that as a bar chart for each sample in the visualization. Anvi’o has a specific format it needs where each row is a sample and each column is a taxon. Taxa names need the prefix t_<RANK>!. For example, t_class! should be added for Class rank.
pick_rank <- "Order"
pick_rank_l <- "order"
for (i in samp_ps_all) {
# Make the table
tmp_get <- get(i)
tmp_glom <- tax_glom(tmp_get, taxrank = pick_rank)
tmp_melt <- psmelt(tmp_glom)
tmp_melt[[pick_rank]] <- as.character(tmp_melt[[pick_rank]])
tmp_abund <- aggregate(Abundance ~ Sample + tmp_melt[[pick_rank]], tmp_melt, FUN = sum)
colnames(tmp_abund)[2] <- "tax_rank"
library(reshape2)
tmp_abund <- as.data.frame(reshape::cast(tmp_abund, Sample ~ tax_rank))
tmp_abund <- tibble::remove_rownames(tmp_abund)
tmp_abund <- tibble::column_to_rownames(tmp_abund, "Sample")
# Reorder table column by sum
tmp_layers <- tmp_abund[,names(sort(colSums(tmp_abund), decreasing = TRUE))]
# Add the prefix
tmp_layers <- tmp_layers %>% dplyr::rename_all(function(x) paste0("t_", pick_rank_l,"!", x))
tmp_layers <- tibble::rownames_to_column (tmp_layers, "taxon")
# save the table
tmp_path <- file.path("files/anvio/its/")
write.table(tmp_layers, paste(tmp_path, i, "/", "tax_layers.txt", sep = ""),
quote = FALSE, sep = "\t", row.names = FALSE)
rm(list = ls(pattern = "tmp_"))
}The last piece we need is to generate dendrograms that order the ASVs by their distribution in the samples and the samples by their ASV composition. For this task we will use anvi’o.
anvi-matrix-to-newick data.txt --distance euclidean \
--linkage ward \
-o asv.tre
anvi-matrix-to-newick data.txt --distance euclidean \
--linkage ward \
-o sample.tre \
--transposeThe first command reads the view data we generated above and uses Euclidean distance and Ward linkage for hierarchical clustering of the ASVs. The second command transposes the view data table and then does the same for the samples. There are several distance metrics and linkage methods available. See the help menu for the command by typing anvi-matrix-to-newick -h. Boom.
Alternatively, we can generate dendrograms using phyloseq::distance and hclust.
pick_dist <- "bray"
pick_clust <- "complete"
for (i in samp_ps) {
# Make the table
tmp_get <- get(i)
tmp_dist <- phyloseq::distance(physeq = tmp_get, method = pick_dist, type = "sample")
tmp_dend <- hclust(tmp_dist, method = pick_clust)
plot(tmp_dend, hang = -1)
tmp_tree <- as.phylo(tmp_dend)
tmp_path <- file.path("files/anvio/its/")
write.tree(phy = tmp_tree, file = paste(tmp_path, i, "/", "sample_", pick_dist, "_", pick_clust, ".tre", sep = ""))
rm(list = ls(pattern = "tmp_"))
}pick_dist_asv <- "euclidean"
pick_clust_asv <- "ward"
for (i in samp_ps) {
# Make the table
tmp_get <- get(purrr::map_chr(i, ~ paste0(., "_trim")))
tmp_dist <- phyloseq::distance(physeq = tmp_get, method = pick_dist_asv, type = "taxa")
tmp_dend <- hclust(tmp_dist, method = pick_clust_asv)
plot(tmp_dend, hang = -1)
tmp_tree <- as.phylo(tmp_dend)
tmp_path <- file.path("files/anvio/its/")
write.tree(phy = tmp_tree, file = paste(tmp_path, i, "/", "asv_", pick_dist_asv, "_", pick_clust_asv, ".tre", sep = ""))
rm(list = ls(pattern = "tmp_"))
}The ASV tree is fine as is, but the sample tree needs a special format. Specifically, the tree needs to be in a three column, tab delimited, table. This way you can add multiple orderings to the same file and view them all in the interactive. The table needs to be in this format:
| item_name | data_type | data_value |
|---|---|---|
| tree_1 | newick | ((P01_D00_010_W8A:0.0250122,P05_D00_010_W8C:0.02,.. |
| tree_2 | newick | ((((((((OTU14195:0.0712585,OTU13230:0.0712585)0:,… |
| (…) | (…) | (…) |
This is easy to do by hand, but I really need the practice so I will do it in R. Anvi’o is very particular about formatting. For example, if this file ends with a blank line, which it will because when anvi’o made the initial dendrogram it add a new line. We need to get rid of that or we get an error when trying to import the table.
for (i in samp_ps) {
tmp_path <- file.path("files/anvio/its/")
tmp_tree <- read_file(paste(tmp_path, i, "/", "sample.tre", sep = ""))
tmp_tree <- gsub("[\r\n]", "", tmp_tree)
tmp_item <- c("bray_complete")
tmp_type <- c("newick")
tmp_df <- c(tmp_tree)
tmp_tab <- data.frame(tmp_item, tmp_type, tmp_df)
library(janitor)
tmp_tab %>% remove_empty("rows")
colnames(tmp_tab) <- c("item_name", "data_type", "data_value")
write.table(tmp_tab, paste(tmp_path, i, "/", "sample.tre", sep = ""),
sep = "\t", quote = FALSE, row.names = FALSE, na = "")
rm(list = ls(pattern = "tmp_"))
}# FOR TRANSFORMED DATA
for (i in samp_ps) {
tmp_path <- file.path("files/anvio/its/")
tmp_tree <- read_file(paste(tmp_path, i, "/","sample_trans.tre", sep = ""))
tmp_tree <- gsub("[\r\n]", "", tmp_tree)
tmp_item <- c("bray_complete")
tmp_type <- c("newick")
tmp_df <- c(tmp_tree)
tmp_tab <- data.frame(tmp_item, tmp_type, tmp_df)
library(janitor)
tmp_tab %>% remove_empty("rows")
colnames(tmp_tab) <- c("item_name", "data_type", "data_value")
write.table(tmp_tab, paste(tmp_path, i, "/", "sample_trans.tre", sep = ""),
sep = "\t", quote = FALSE, row.names = FALSE, na = "")
rm(list = ls(pattern = "tmp_"))
}
objects()# FOR HCLUST TREE
for (i in samp_ps) {
tmp_path <- file.path("files/anvio/its/")
tmp_tree <- read_file(paste(tmp_path, i, "/", "sample_", pick_dist, "_", pick_clust, ".tre", sep = ""))
tmp_tree <- gsub("[\r\n]", "", tmp_tree)
tmp_item <- c(paste(pick_dist, "_", pick_clust, "_hclust", sep = ""))
tmp_type <- c("newick")
tmp_df <- c(tmp_tree)
tmp_tab <- data.frame(tmp_item, tmp_type, tmp_df)
library(janitor)
tmp_tab %>% remove_empty("rows")
colnames(tmp_tab) <- c("item_name", "data_type", "data_value")
write.table(tmp_tab, paste(tmp_path, i, "/", "sample_", pick_dist, "_", pick_clust, ".tre", sep = ""),
sep = "\t", quote = FALSE, row.names = FALSE, na = "")
rm(list = ls(pattern = "tmp_"))
}We don’t need to add a fasta file, but it is a nice way to keep everything in one place. Plus, you can do BLAST searches directly in the interface by right clicking on the ASV of interest, so it is nice to have the sequences.
for (i in samp_ps) {
tmp_get <- get(purrr::map_chr(i, ~ paste0(., "_trim")))
tmp_tab <- tax_table(tmp_get)
tmp_tab <- tmp_tab[, 7]
tmp_df <- data.frame(row.names(tmp_tab), tmp_tab)
colnames(tmp_df) <- c("ASV_ID", "ASV_SEQ")
tmp_df$ASV_ID <- sub("^", ">", tmp_df$ASV_ID)
tmp_path <- file.path("files/anvio/its/")
write.table(tmp_df, paste(tmp_path, i, "/", i, ".fasta", sep = ""),
sep = "\n", col.names = FALSE, row.names = FALSE,
quote = FALSE, fileEncoding = "UTF-8")
rm(list = ls(pattern = "tmp_"))
}Time to put all of these pieces together. This gets a little tricky since we do not have a database to start with because some of these files can be loaded directly in the interface but some need to be added to a database. When we fire up the interactive in --manual mode, we must give anvi’o the name of a database and it will create that database for us. Then we can shut down the interactive, add the necessary data files, and start back up.
anvi-interactive --view-data data.txt \
--tree asv.tre \
--additional-layers additional_layers.txt \
--profile-db profile.db \
--manualNow we have a new profile database that we can add the sample metadata (additional_layers.txt) and the sample dendrogram (sample.tre) using the command anvi-import-misc-data. These commands add the table to the new profile.db. First, kill the interactive.
anvi-import-misc-data additional_views.txt \
--pan-or-profile-db profile.db \
--target-data-table layers
anvi-import-misc-data sample.tre \
--pan-or-profile-db profile.db \
--target-data-table layer_ordersOne last this is to get the table with the taxonomy total by sample (tax_layers.txt) into the profile database. We will run the same command we just used.
anvi-import-misc-data tax_layers.txt \
--pan-or-profile-db profile.db \
--target-data-table layersIn fact, we could just as easily append the taxonomy total data onto the additional_layers.txt and import in one command. But we didn’t.
With a populated database in hand, we can now begin modifying the visual by running the interactive command again.
anvi-interactive --view-data data.txt \
--tree asv.tre \
--additional-layers additional_layers.txt \
--profile-db profile.db
--fasta-file anvio.fasta \
--manualThe source code for this page can be accessed on GitHub by clicking this link. Please note, that in order to process the data and build the website, we needed to run the workflow and get the results. Then hard code the results and turn off the individual commands. So the raw file for this page is a bit messy—you have been warned.
If you see mistakes or want to suggest changes, please create an issue on the source repository.
Text and figures are licensed under Creative Commons Attribution CC BY 4.0. Source code is available at https://github.com/sweltr/high-temp/, unless otherwise noted. The figures that have been reused from other sources don't fall under this license and can be recognized by a note in their caption: "Figure from ...".